import pandas as pd
import gc
from bokeh.charts import Histogram, BoxPlot, output_file, show
from bokeh.charts import output_notebook
# Send Bokeh output to the notebook instead of a standalone HTML file.
output_notebook()

# Load the playlist/track table; the CSV uses ';' as its field separator.
play_track_path = "../../../data/play_track.csv"
df = pd.read_csv(play_track_path, sep=";")
# Count the songs in each playlist.
# Number of rows (track entries, repeats included) for every playlist id.
# GroupBy.size() returns the same per-group counts as apply(len) but is
# computed by pandas directly instead of calling len() once per group.
playlist_len = df.groupby("pid").size()
# Remove duplicate songs within the same playlist, then count again.
# Keep one row per (playlist, track) pair so each track counts once per playlist.
unique_pid_track = df[["pid", "track_uri"]].drop_duplicates()

# Distinct-track count per playlist id. size() counts every remaining row in
# the group, exactly as apply(len) did, without a per-group Python call.
playlist_unique_len = unique_pid_track.groupby("pid")["track_uri"].size()

# The frames are not used again below; drop the references and trigger a
# collection so their memory can be reclaimed before plotting.
unique_pid_track = None
df = None
gc.collect()
# Calculate the number of repeated songs in each playlist.
# Repeated entries per playlist: total entries minus distinct tracks.
diff = playlist_len - playlist_unique_len

# Share of each playlist's entries that are repeats.
prop = diff / playlist_len

# The mean of a boolean Series is the fraction of True values, i.e. the share
# of playlists with at least one repeat. Unlike sum() / len(), it cannot be
# truncated by integer division. The message is printed because a bare string
# expression is silently discarded when this runs as a script.
print(
    "Proportion of playlists with repeated songs: {}".format((diff > 0).mean())
)
# Distribution of playlist sizes (repeated entries included).
p = Histogram(
    playlist_len,
    title="Playlist length",
    width=600,
    height=300,
    tools=["save", "xpan", "xwheel_zoom"],
)
show(p)

# Distribution of the repeat count, restricted to playlists that have repeats.
p = Histogram(
    diff[diff > 0],
    title="Repeated songs by playlist",
    width=600,
    height=300,
    tools=["save", "xpan", "xwheel_zoom"],
)
show(p)

# Distribution of the repeat proportion, restricted to playlists with repeats.
# The title previously duplicated the count plot's title, which made the two
# charts indistinguishable; it now names the quantity actually plotted.
p = Histogram(
    prop[prop > 0],
    title="Proportion of repeated songs by playlist",
    width=600,
    height=300,
    tools=["save", "xpan", "xwheel_zoom"],
)
show(p)

# Summary statistics of the repeat proportion over all playlists. Printed
# explicitly because a bare expression result is dropped outside a notebook.
print(prop.describe())
# Tales Pimentel
# tales.tsp@gmail.com